Bulk RNASeq data from MSBB.

# gaussian to run the linear regression 
pheno_list = c("CERAD" = "gaussian",
               "Braak" = "gaussian",
               "CDR" = "gaussian",
               "plaqueMean" = "gaussian"
               )
work_dir = "/pastel/Github_scripts/SpeakEasy_dlpfc/bulk_dlpfc/replication/"
resources = "/pastel/resources/MSBB/RNAseq/"
net_dir = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_net_MF/"

library(tidyverse)
library(janitor)
library(WGCNA)

We already have the modules from our ROSMAP dataset, so I’ll use this as gene list and calculate the eigengenes and module average expression for the same genes for the MSBB. Then, we’ll run regressions with their AD-related covariates to check for associations.

Input external dataset

syn27068756

# metadata 
metadata = read.csv(paste0(resources, "RNAseq_Harmonization_MSBB_combined_metadata.csv"))
metadata$ageDeath = as.numeric(gsub("\\+", "", metadata$ageDeath))

message(paste0("Unique individualID: ", length(unique(metadata$individualID)))) 
## Unique individualID: 316
message(paste0("Unique specimenID: ", length(unique(metadata$specimenID)))) 
## Unique specimenID: 1282
table(metadata$tissue)
## 
##            frontal pole  inferior frontal gyrus   parahippocampal gyrus 
##                     310                     308                     315 
##       prefrontal cortex superior temporal gyrus 
##                      15                     334

Select only one region

To avoid repeated measures.

metadata_temp = metadata[metadata$exclude == "FALSE", ] # Remove the samples that didn't pass QC 
metadata_sel = metadata_temp[metadata_temp$tissue == "frontal pole", ] # select the frontal pole (BA = 10)

# expression matrix 
expr_mtx = as.data.frame(read_tsv(paste0(resources, "MSBB_Normalized_counts_CQN.tsv"), show_col_types = F))
rownames(expr_mtx) = expr_mtx$feature
expr_mtx_sel = expr_mtx[, colnames(expr_mtx) %in% metadata_sel$specimenID] # select the same samples from the metadata 

Input ROSMAP networks

Bulk DLPFC.

modules_file = read.table(paste0(net_dir, "geneBycluster.txt"), header = T)

modules_size = as.data.frame( table(modules_file$cluster_lv3))
colnames(modules_size) = c("module", "n_nodes")
too_small = as.character( modules_size$module[modules_size$n_nodes < 30]) # Get modules < 30 nodes to be removed
net_output = modules_file %>% filter(! cluster_lv3 %in% as.integer(too_small)) # the clusters with at least 30 nodes 
net_output = net_output[,c("ensembl", "cluster_lv3", "gene_type", "symbol")]
net_output$ensembl2 = gsub("(.*)\\.(.*)", "\\1",net_output$ensembl)

message(paste0("Number of modules: ", length(unique(net_output$cluster_lv3))))
## Number of modules: 34

Average expression - MSBB

I got our gene list and calculated the ME for the MSBB expression.

gene_mod_map = data.frame(ensembl = rownames(expr_mtx_sel))
gene_mod_map = gene_mod_map %>% inner_join(net_output[,-1], by = c("ensembl" = "ensembl2"))

expr_mtx_sel_ord = expr_mtx_sel[gene_mod_map$ensembl,] # order the matrix
identical(rownames(expr_mtx_sel_ord), gene_mod_map$ensembl) # must be TRUE 
## [1] TRUE
colors_mod = gene_mod_map$cluster_lv3
expr_mtx_sel_ord_t = as.data.frame(t(expr_mtx_sel_ord))
external_ME = moduleEigengenes(expr_mtx_sel_ord_t, colors = colors_mod)
mod_average = external_ME$averageExpr # data.frame

# save results
# save(external_ME, metadata_sel, file = paste0(work_dir, "MSBB_moduleEigengenes_lv3.Rdata"))
# write.table(external_ME$eigengenes, file = paste0(work_dir, "MSBB_moduleEigengenes_lv3.txt"), sep = "\t", quote = F, row.names = T)
# write.table(external_ME$averageExpr, file = paste0(work_dir, "MSBB_moduleaverageExpr_lv3.txt"), sep = "\t", quote = F, row.names = T)

createDT(mod_average)

Regressions

Input: Average expression by module.

Numbers and colors : -log10(nominal pvalue)

cutpoints:

< 0.001 = ***

0.01 = **

0.05 = *

0.1 = .

1 = ” ”

data4linear_reg <- mod_average # match codes name 
phenotype_dt = metadata_sel[match(rownames(data4linear_reg), metadata_sel$specimenID), ]
all(rownames(data4linear_reg) == phenotype_dt$specimenID) # Must be TRUE. Match IDs
## [1] TRUE
# hist(phenotype_dt$Braak)
message(paste0("Unique individualID: ", length(unique(phenotype_dt$individualID)))) 
## Unique individualID: 278
message(paste0("Unique specimenID: ", length(unique(phenotype_dt$specimenID)))) 
## Unique specimenID: 304
res_test = run_module_trait_association(data4linear_reg, phenotype_dt, pheno_list, covariates = c("ageDeath","sex", "ethnicity")) # covariates to adjust
matrix_rsquared = res_test$matrix_rsquared
matrix_pvalue = res_test$matrix_pvalue

### Get modules >= 30 nodes to show:
to_show = paste0("AE",as.character( modules_size$module[modules_size$n_nodes >= 30])) 

# pdf("/pastel/Github_scripts/SpeakEasy_dlpfc/figures4paper/reg_MSBB_adjcov.pdf", width = 3, height = 11)
plot_module_trait_association_heatmap(res_test, to_show)

# dev.off()

Top results

Top result by covariate.

# res_test$all_stats_df 
createDT(res_test$all_stats_df %>% group_by(phenotype) %>% slice_head(n = 1))

Nominal pvalue

createDT(matrix_pvalue)

Significant results

Adj P < 0.05

Threshold: At least one module with adjusted pvalue < 0.05.

Method: Bonferroni by column.

plot_module_trait_association_heatmap(res_test, to_show, show_only_significant = T, signif_cutoff = 0.05)
## Adjusting P-values by Bonferroni correction by each phenotype separately. Double check if rows of res_test$matrix_pvalue are phenotypes and columns are your features (e.g. modules)
## Adjusting P-values by Bonferroni correction by each phenotype separately. Double check if rows of res_test$matrix_pvalue are phenotypes and columns are your features (e.g. modules)
## Adjusting P-values by Bonferroni correction by each phenotype separately. Double check if rows of res_test$matrix_pvalue are phenotypes and columns are your features (e.g. modules)
## Adjusting P-values by Bonferroni correction by each phenotype separately. Double check if rows of res_test$matrix_pvalue are phenotypes and columns are your features (e.g. modules)

Adj P < 0.000001

plot_module_trait_association_heatmap(res_test, to_show, show_only_significant = T, signif_cutoff = 0.000001)
## Adjusting P-values by Bonferroni correction by each phenotype separately. Double check if rows of res_test$matrix_pvalue are phenotypes and columns are your features (e.g. modules)
## Adjusting P-values by Bonferroni correction by each phenotype separately. Double check if rows of res_test$matrix_pvalue are phenotypes and columns are your features (e.g. modules)
## Adjusting P-values by Bonferroni correction by each phenotype separately. Double check if rows of res_test$matrix_pvalue are phenotypes and columns are your features (e.g. modules)
## Adjusting P-values by Bonferroni correction by each phenotype separately. Double check if rows of res_test$matrix_pvalue are phenotypes and columns are your features (e.g. modules)

## MF_M3 vs CERAD 
mod_average = external_ME$averageExpr # data.frame
metadata_sel2 = metadata_sel[match(rownames(mod_average), metadata_sel$specimenID), ]
all(rownames(mod_average) == metadata_sel2$specimenID) # must be TRUE
mod_average$specimenID = rownames(mod_average)
data4reg = mod_average %>% left_join(metadata_sel2, by = "specimenID")

results_lm = lm(formula = AE3 ~ CERAD + ageDeath + sex + ethnicity, data = data4reg)# no repeated measures
summary(results_lm)
res_test$all_stats_df$adj_pvalue = p.adjust(res_test$all_stats_df$nom_p, 
                                            method = "fdr")

createDT(res_test$all_stats_df)

Session info

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3    circlize_0.4.16       ComplexHeatmap_2.15.4
##  [4] performance_0.12.2    lmerTest_3.1-3        lme4_1.1-35.1        
##  [7] Matrix_1.6-5          WGCNA_1.72-5          fastcluster_1.2.6    
## [10] dynamicTreeCut_1.63-1 janitor_2.2.0         lubridate_1.9.3      
## [13] forcats_1.0.0         stringr_1.5.1         dplyr_1.1.4          
## [16] purrr_1.0.2           readr_2.1.5           tidyr_1.3.1          
## [19] tibble_3.2.1          ggplot2_3.5.1         tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##   [1] minqa_1.2.7            colorspace_2.1-1       rjson_0.2.21          
##   [4] snakecase_0.11.1       htmlTable_2.4.3        XVector_0.34.0        
##   [7] GlobalOptions_0.1.2    base64enc_0.1-3        clue_0.3-65           
##  [10] rstudioapi_0.16.0      DT_0.33                bit64_4.0.5           
##  [13] AnnotationDbi_1.56.2   fansi_1.0.6            codetools_0.2-18      
##  [16] splines_4.1.2          doParallel_1.0.17      cachem_1.1.0          
##  [19] impute_1.68.0          knitr_1.48             Formula_1.2-5         
##  [22] jsonlite_1.8.8         nloptr_2.1.1           Cairo_1.6-2           
##  [25] cluster_2.1.2          GO.db_3.14.0           png_0.1-8             
##  [28] compiler_4.1.2         httr_1.4.7             backports_1.5.0       
##  [31] fastmap_1.2.0          cli_3.6.3              htmltools_0.5.8.1     
##  [34] tools_4.1.2            gtable_0.3.5           glue_1.7.0            
##  [37] GenomeInfoDbData_1.2.7 reshape2_1.4.4         Rcpp_1.0.13           
##  [40] Biobase_2.54.0         jquerylib_0.1.4        vctrs_0.6.5           
##  [43] Biostrings_2.62.0      preprocessCore_1.61.0  nlme_3.1-153          
##  [46] iterators_1.0.14       crosstalk_1.2.1        insight_0.20.2        
##  [49] xfun_0.46              timechange_0.3.0       lifecycle_1.0.4       
##  [52] zlibbioc_1.40.0        MASS_7.3-60.0.1        scales_1.3.0          
##  [55] vroom_1.6.5            hms_1.1.3              parallel_4.1.2        
##  [58] yaml_2.3.10            memoise_2.0.1          gridExtra_2.3         
##  [61] sass_0.4.9             rpart_4.1-15           stringi_1.8.4         
##  [64] RSQLite_2.3.7          highr_0.11             S4Vectors_0.32.4      
##  [67] foreach_1.5.2          checkmate_2.3.2        BiocGenerics_0.40.0   
##  [70] boot_1.3-28            shape_1.4.6.1          GenomeInfoDb_1.30.1   
##  [73] rlang_1.1.4            pkgconfig_2.0.3        matrixStats_1.3.0     
##  [76] bitops_1.0-8           evaluate_0.24.0        lattice_0.20-45       
##  [79] htmlwidgets_1.6.4      bit_4.0.5              tidyselect_1.2.1      
##  [82] plyr_1.8.9             magrittr_2.0.3         R6_2.5.1              
##  [85] magick_2.8.4           IRanges_2.28.0         generics_0.1.3        
##  [88] Hmisc_5.1-3            DBI_1.2.3              pillar_1.9.0          
##  [91] foreign_0.8-81         withr_3.0.0            survival_3.7-0        
##  [94] KEGGREST_1.34.0        RCurl_1.98-1.16        nnet_7.3-16           
##  [97] crayon_1.5.3           utf8_1.2.4             tzdb_0.4.0            
## [100] rmarkdown_2.27         GetoptLong_1.0.5       data.table_1.15.4     
## [103] blob_1.2.4             digest_0.6.36          numDeriv_2016.8-1.1   
## [106] stats4_4.1.2           munsell_0.5.1          bslib_0.8.0